I want to transform an XML document into HTML. Some XML elements have links to others documents like:
In the H
Since I did't see here anything about perf which is a relatively new tool for profiling the kernel and user applications on Linux I decided to add this information.
First of all - this is a tutorial about Linux profiling with perf
You can use perf if your Linux Kernel is greater than 2.6.32 or oprofile if it is older. Both programs don't require from you to instrument your program (like gprof requires). However in order to get call graph correctly in perf you need to build you program with -fno-omit-frame-pointer. For example: g++ -fno-omit-frame-pointer -O2 main.cpp.
You can see "live" analysis of your application with perf top:
sudo perf top -p `pidof a.out` -K
Or you can record performance data of a running application and analyze them after that:
1) To record performance data:
perf record -p `pidof a.out`
or to record for 10 secs:
perf record -p `pidof a.out` sleep 10
or to record with call graph ()
perf record -g -p `pidof a.out`
2) To analyze the recorded data
perf report --stdio
perf report --stdio --sort=dso -g none
perf report --stdio -g none
perf report --stdio -g
Or you can record performace data of a application and analyze them after that just by launching the application in this way and waiting for it to exit:
perf record ./a.out
This is an example of profiling a test program
The test program is in file main.cpp (I will put main.cpp at the bottom of the message):
I compile it in this way:
g++ -m64 -fno-omit-frame-pointer -g main.cpp -L. -ltcmalloc_minimal -o my_test
I use libmalloc_minimial.so since it is compiled with -fno-omit-frame-pointer while libc malloc seems to be compiled without this option.
Then I run my test program
./my_test 100000000
Then I record performance data of a running process:
perf record -g -p `pidof my_test` -o ./my_test.perf.data sleep 30
Then I analyze load per module:
perf report --stdio -g none --sort comm,dso -i ./my_test.perf.data
# Overhead Command Shared Object
# ........ ....... ............................
#
70.06% my_test my_test
28.33% my_test libtcmalloc_minimal.so.0.1.0
1.61% my_test [kernel.kallsyms]
Then load per function is analyzed:
perf report --stdio -g none -i ./my_test.perf.data | c++filt
# Overhead Command Shared Object Symbol
# ........ ....... ............................ ...........................
#
29.30% my_test my_test [.] f2(long)
29.14% my_test my_test [.] f1(long)
15.17% my_test libtcmalloc_minimal.so.0.1.0 [.] operator new(unsigned long)
13.16% my_test libtcmalloc_minimal.so.0.1.0 [.] operator delete(void*)
9.44% my_test my_test [.] process_request(long)
1.01% my_test my_test [.] operator delete(void*)@plt
0.97% my_test my_test [.] operator new(unsigned long)@plt
0.20% my_test my_test [.] main
0.19% my_test [kernel.kallsyms] [k] apic_timer_interrupt
0.16% my_test [kernel.kallsyms] [k] _spin_lock
0.13% my_test [kernel.kallsyms] [k] native_write_msr_safe
and so on ...
Then call chains are analyzed:
perf report --stdio -g graph -i ./my_test.perf.data | c++filt
# Overhead Command Shared Object Symbol
# ........ ....... ............................ ...........................
#
29.30% my_test my_test [.] f2(long)
|
--- f2(long)
|
--29.01%-- process_request(long)
main
__libc_start_main
29.14% my_test my_test [.] f1(long)
|
--- f1(long)
|
|--15.05%-- process_request(long)
| main
| __libc_start_main
|
--13.79%-- f2(long)
process_request(long)
main
__libc_start_main
15.17% my_test libtcmalloc_minimal.so.0.1.0 [.] operator new(unsigned long)
|
--- operator new(unsigned long)
|
|--11.44%-- f1(long)
| |
| |--5.75%-- process_request(long)
| | main
| | __libc_start_main
| |
| --5.69%-- f2(long)
| process_request(long)
| main
| __libc_start_main
|
--3.01%-- process_request(long)
main
__libc_start_main
13.16% my_test libtcmalloc_minimal.so.0.1.0 [.] operator delete(void*)
|
--- operator delete(void*)
|
|--9.13%-- f1(long)
| |
| |--4.63%-- f2(long)
| | process_request(long)
| | main
| | __libc_start_main
| |
| --4.51%-- process_request(long)
| main
| __libc_start_main
|
|--3.05%-- process_request(long)
| main
| __libc_start_main
|
--0.80%-- f2(long)
process_request(long)
main
__libc_start_main
9.44% my_test my_test [.] process_request(long)
|
--- process_request(long)
|
--9.39%-- main
__libc_start_main
1.01% my_test my_test [.] operator delete(void*)@plt
|
--- operator delete(void*)@plt
0.97% my_test my_test [.] operator new(unsigned long)@plt
|
--- operator new(unsigned long)@plt
0.20% my_test my_test [.] main
0.19% my_test [kernel.kallsyms] [k] apic_timer_interrupt
0.16% my_test [kernel.kallsyms] [k] _spin_lock
and so on ...
So at this point you know where your program spends time.
And this is main.cpp for the test:
#include
#include
#include
time_t f1(time_t time_value)
{
for (int j =0; j < 10; ++j) {
++time_value;
if (j%5 == 0) {
double *p = new double;
delete p;
}
}
return time_value;
}
time_t f2(time_t time_value)
{
for (int j =0; j < 40; ++j) {
++time_value;
}
time_value=f1(time_value);
return time_value;
}
time_t process_request(time_t time_value)
{
for (int j =0; j < 10; ++j) {
int *p = new int;
delete p;
for (int m =0; m < 10; ++m) {
++time_value;
}
}
for (int i =0; i < 10; ++i) {
time_value=f1(time_value);
time_value=f2(time_value);
}
return time_value;
}
int main(int argc, char* argv2[])
{
int number_loops = argc > 1 ? atoi(argv2[1]) : 1;
time_t time_value = time(0);
printf("number loops %d\n", number_loops);
printf("time_value: %d\n", time_value );
for (int i =0; i < number_loops; ++i) {
time_value = process_request(time_value);
}
printf("time_value: %ld\n", time_value );
return 0;
}